> project_summary = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/project-summary.csv"
> counts_file = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/combined.counts"
> tx2genes_file = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/tx2gene.csv"
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
+ "#D55E00", "#CC79A7")
> summarydata = read.table(project_summary, header = TRUE, sep = ",")
> summarydata = summarydata[, colSums(is.na(summarydata)) < nrow(summarydata)]
> # handle newer bcbio-nextgen runs that use description as the key
> if ("description" %in% colnames(summarydata)) {
+ rownames(summarydata) = summarydata$description
+ summarydata$Name = rownames(summarydata)
+ summarydata$description = NULL
+ } else {
+ rownames(summarydata) = summarydata$Name
+ # summarydata$Name = NULL
+ }
> summarydata = summarydata[order(rownames(summarydata)), ]
> if (file.exists(tx2genes_file)) {
+ sample_dirs = file.path(dirname(project_summary), "..", rownames(summarydata))
+ salmon_files = file.path(sample_dirs, "salmon", "quant.sf")
+ sailfish_files = file.path(sample_dirs, "sailfish", "quant.sf")
+ new_sailfish = file.path(sample_dirs, "sailfish", "quant", "quant.sf")
+ new_salmon = file.path(sample_dirs, "salmon", "quant", "quant.sf")
+ if (file.exists(salmon_files[1])) {
+ sf_files = salmon_files
+ } else if (file.exists(sailfish_files[1])) {
+ sf_files = sailfish_files
+ } else if (file.exists(new_sailfish[1])) {
+ sf_files = new_sailfish
+ } else if (file.exists(new_salmon[1])) {
+ sf_files = new_salmon
+ }
+ names(sf_files) = rownames(summarydata)
+ tx2gene = read.table(tx2genes_file, sep = ",", row.names = NULL, header = FALSE)
+ txi.salmon = tximport(sf_files, type = "salmon", tx2gene = tx2gene, reader = readr::read_tsv,
+ countsFromAbundance = "lengthScaledTPM")
+ counts = round(data.frame(txi.salmon$counts, check.names = FALSE))
+ } else {
+ counts = read.table(counts_file, header = TRUE, row.names = "id", check.names = FALSE)
+ }
|======== | 10%
|========== | 12%
|=========== | 14%
|============= | 16%
|=============== | 18%
|================ | 21%
|================== | 23%
|==================== | 25%
|===================== | 27%
|======================= | 29% 1 MB
|========================= | 31% 1 MB
|=========================== | 33% 1 MB
|============================ | 35% 1 MB
|============================== | 37% 1 MB
|================================ | 40% 1 MB
|================================= | 42% 1 MB
|=================================== | 44% 1 MB
|===================================== | 46% 1 MB
|====================================== | 48% 1 MB
|======================================== | 50% 1 MB
|========================================== | 52% 1 MB
|=========================================== | 54% 1 MB
|============================================= | 57% 2 MB
|=============================================== | 59% 2 MB
|================================================= | 61% 2 MB
|================================================== | 63% 2 MB
|==================================================== | 65% 2 MB
|====================================================== | 67% 2 MB
|======================================================= | 69% 2 MB
|========================================================= | 71% 2 MB
|=========================================================== | 74% 2 MB
|============================================================ | 76% 2 MB
|============================================================== | 78% 2 MB
|================================================================ | 80% 2 MB
|================================================================= | 82% 2 MB
|=================================================================== | 84% 3 MB
|===================================================================== | 86% 3 MB
|======================================================================= | 88% 3 MB
|======================================================================== | 91% 3 MB
|========================================================================== | 93% 3 MB
|============================================================================ | 95% 3 MB
|============================================================================= | 97% 3 MB
|===============================================================================| 99% 3 MB
|================================================================================| 100% 3 MB
|========== | 12%
|=========== | 14%
|============= | 16%
|=============== | 18%
|================ | 21%
|================== | 23%
|==================== | 25%
|===================== | 27% 1 MB
|======================= | 29% 1 MB
|========================= | 31% 1 MB
|=========================== | 33% 1 MB
|============================ | 35% 1 MB
|============================== | 38% 1 MB
|================================ | 40% 1 MB
|================================= | 42% 1 MB
|=================================== | 44% 1 MB
|===================================== | 46% 1 MB
|====================================== | 48% 1 MB
|======================================== | 50% 1 MB
|========================================== | 52% 1 MB
|=========================================== | 54% 2 MB
|============================================= | 57% 2 MB
|=============================================== | 59% 2 MB
|================================================= | 61% 2 MB
|================================================== | 63% 2 MB
|==================================================== | 65% 2 MB
|====================================================== | 67% 2 MB
|======================================================= | 69% 2 MB
|========================================================= | 72% 2 MB
|=========================================================== | 74% 2 MB
|============================================================ | 76% 2 MB
|============================================================== | 78% 2 MB
|================================================================ | 80% 2 MB
|================================================================== | 82% 3 MB
|=================================================================== | 84% 3 MB
|===================================================================== | 86% 3 MB
|======================================================================= | 88% 3 MB
|======================================================================== | 91% 3 MB
|========================================================================== | 93% 3 MB
|============================================================================ | 95% 3 MB
|============================================================================= | 97% 3 MB
|===============================================================================| 99% 3 MB
|================================================================================| 100% 3 MB
|========== | 12%
|=========== | 14%
|============= | 16%
|=============== | 18%
|================ | 21%
|================== | 23%
|==================== | 25%
|===================== | 27%
|======================= | 29% 1 MB
|========================= | 31% 1 MB
|========================== | 33% 1 MB
|============================ | 35% 1 MB
|============================== | 37% 1 MB
|================================ | 40% 1 MB
|================================= | 42% 1 MB
|=================================== | 44% 1 MB
|===================================== | 46% 1 MB
|====================================== | 48% 1 MB
|======================================== | 50% 1 MB
|========================================== | 52% 1 MB
|=========================================== | 54% 1 MB
|============================================= | 57% 2 MB
|=============================================== | 59% 2 MB
|================================================= | 61% 2 MB
|================================================== | 63% 2 MB
|==================================================== | 65% 2 MB
|====================================================== | 67% 2 MB
|======================================================= | 69% 2 MB
|========================================================= | 71% 2 MB
|=========================================================== | 74% 2 MB
|============================================================ | 76% 2 MB
|============================================================== | 78% 2 MB
|================================================================ | 80% 2 MB
|================================================================== | 82% 2 MB
|=================================================================== | 84% 3 MB
|===================================================================== | 86% 3 MB
|======================================================================= | 88% 3 MB
|======================================================================== | 91% 3 MB
|========================================================================== | 93% 3 MB
|============================================================================ | 95% 3 MB
|============================================================================= | 97% 3 MB
|===============================================================================| 99% 3 MB
|================================================================================| 100% 3 MB
|========== | 12%
|=========== | 14%
|============= | 16%
|=============== | 18%
|================ | 21%
|================== | 23%
|==================== | 25%
|===================== | 27% 1 MB
|======================= | 29% 1 MB
|========================= | 31% 1 MB
|=========================== | 33% 1 MB
|============================ | 35% 1 MB
|============================== | 37% 1 MB
|================================ | 40% 1 MB
|================================= | 42% 1 MB
|=================================== | 44% 1 MB
|===================================== | 46% 1 MB
|====================================== | 48% 1 MB
|======================================== | 50% 1 MB
|========================================== | 52% 1 MB
|=========================================== | 54% 2 MB
|============================================= | 57% 2 MB
|=============================================== | 59% 2 MB
|================================================= | 61% 2 MB
|================================================== | 63% 2 MB
|==================================================== | 65% 2 MB
|====================================================== | 67% 2 MB
|======================================================= | 69% 2 MB
|========================================================= | 72% 2 MB
|=========================================================== | 74% 2 MB
|============================================================ | 76% 2 MB
|============================================================== | 78% 2 MB
|================================================================ | 80% 2 MB
|================================================================== | 82% 3 MB
|=================================================================== | 84% 3 MB
|===================================================================== | 86% 3 MB
|======================================================================= | 88% 3 MB
|======================================================================== | 91% 3 MB
|========================================================================== | 93% 3 MB
|============================================================================ | 95% 3 MB
|============================================================================= | 97% 3 MB
|===============================================================================| 99% 3 MB
|================================================================================| 100% 3 MB
|===== | 6%
|====== | 8%
|======== | 10%
|========== | 12%
|=========== | 14%
|============= | 16%
|=============== | 18%
|================ | 21%
|================== | 23%
|==================== | 25%
|===================== | 27%
|======================= | 29% 1 MB
|========================= | 31% 1 MB
|=========================== | 33% 1 MB
|============================ | 35% 1 MB
|============================== | 37% 1 MB
|================================ | 40% 1 MB
|================================= | 42% 1 MB
|=================================== | 44% 1 MB
|===================================== | 46% 1 MB
|====================================== | 48% 1 MB
|======================================== | 50% 1 MB
|========================================== | 52% 1 MB
|=========================================== | 54% 1 MB
|============================================= | 57% 2 MB
|=============================================== | 59% 2 MB
|================================================= | 61% 2 MB
|================================================== | 63% 2 MB
|==================================================== | 65% 2 MB
|====================================================== | 67% 2 MB
|======================================================= | 69% 2 MB
|========================================================= | 71% 2 MB
|=========================================================== | 74% 2 MB
|============================================================ | 76% 2 MB
|============================================================== | 78% 2 MB
|================================================================ | 80% 2 MB
|================================================================== | 82% 2 MB
|=================================================================== | 84% 3 MB
|===================================================================== | 86% 3 MB
|======================================================================= | 88% 3 MB
|======================================================================== | 91% 3 MB
|========================================================================== | 93% 3 MB
|============================================================================ | 95% 3 MB
|============================================================================= | 97% 3 MB
|===============================================================================| 99% 3 MB
|================================================================================| 100% 3 MB
|=== | 4%
|===== | 6%
|====== | 8%
|======== | 10%
|========== | 12%
|=========== | 14%
|============= | 16%
|=============== | 18%
|================ | 21%
|================== | 23%
|==================== | 25%
|===================== | 27%
|======================= | 29% 1 MB
|========================= | 31% 1 MB
|========================== | 33% 1 MB
|============================ | 35% 1 MB
|============================== | 37% 1 MB
|================================ | 40% 1 MB
|================================= | 42% 1 MB
|=================================== | 44% 1 MB
|===================================== | 46% 1 MB
|====================================== | 48% 1 MB
|======================================== | 50% 1 MB
|========================================== | 52% 1 MB
|=========================================== | 54% 2 MB
|============================================= | 57% 2 MB
|=============================================== | 59% 2 MB
|================================================= | 61% 2 MB
|================================================== | 63% 2 MB
|==================================================== | 65% 2 MB
|====================================================== | 67% 2 MB
|======================================================= | 69% 2 MB
|========================================================= | 71% 2 MB
|=========================================================== | 74% 2 MB
|============================================================ | 76% 2 MB
|============================================================== | 78% 2 MB
|================================================================ | 80% 2 MB
|================================================================== | 82% 3 MB
|=================================================================== | 84% 3 MB
|===================================================================== | 86% 3 MB
|======================================================================= | 88% 3 MB
|======================================================================== | 91% 3 MB
|========================================================================== | 93% 3 MB
|============================================================================ | 95% 3 MB
|============================================================================= | 97% 3 MB
|===============================================================================| 99% 3 MB
|================================================================================| 100% 3 MB
> counts = counts[, order(colnames(counts)), drop = FALSE]
> colnames(counts) = gsub(".counts", "", colnames(counts))
>
> # this is a list of all non user-supplied metadata columns that could appear
> known_columns = c("Name", "X.GC", "Exonic.Rate", "Sequences.flagged.as.poor.quality",
+ "rRNA_rate", "Fragment.Length.Mean", "Intronic.Rate", "Intergenic.Rate",
+ "Mapping.Rate", "Quality.format", "Duplication.Rate.of.Mapped", "Mapped",
+ "rRNA", "Sequence.length", "Transcripts.Detected", "Mean.Per.Base.Cov.",
+ "Genes.Detected", "Unique.Starts.Per.Read", "unique_starts_per_read", "complexity",
+ "X5.3.bias", "Duplicates.pct", "Duplicates", "Mapped.reads", "Average.insert.size",
+ "Mapped.reads.pct", "Total.reads", "avg_coverage_per_region", "Mapped.Reads")
> summarydata[, "Fragment.Length.Mean"] = summarydata$Average.insert.size
> summarydata$gt = paste(summarydata$genotype, summarydata$treatment, sep = "_")
> metadata = summarydata[, !colnames(summarydata) %in% known_columns, drop = FALSE]
> metadata = metadata[, colSums(is.na(metadata)) < nrow(metadata), drop = FALSE]
> sanitize_datatable = function(df, ...) {
+ # remove dashes which cause wrapping
+ DT::datatable(df, ..., rownames = gsub("-", "_", rownames(df)), colnames = gsub("-",
+ "_", colnames(df)))
+ }
> # set seed for reproducibility
> set.seed(1454944673)
> get_heatmap_fn = function(summarydata) {
+ # return the pheatmap function with or without metadata
+ if (ncol(metadata) == 0) {
+ return(pheatmap)
+ } else {
+ # rownames(metadata) = summarydata$Name
+ heatmap_fn = function(data, ...) {
+ pheatmap(data, annotation = metadata, clustering_method = "ward.D2",
+ clustering_distance_cols = "correlation", ...)
+ }
+ return(heatmap_fn)
+ }
+ }
> heatmap_fn = get_heatmap_fn(summarydata)
> qualimap_run = "Mapped" %in% colnames(summarydata)
> do_quality = "Mapped.reads" %in% colnames(summarydata)
Mapped reads looks great, there is some variation but that is to be expected.
> ggplot(summarydata, aes(x = Name, y = Mapped)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
> ggplot(summarydata, aes(x = Name, y = Mapped.reads)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
The genomic mapping rate looks excellent for all of the samples.
> ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
> ggplot(summarydata, aes(x = Name, y = Mapped.reads.pct)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
We see around the same number of genes detected in each sample, which is another good sign.
> dd = data.frame(Name = colnames(counts), Genes.Detected = colSums(counts > 0))
> ggplot(dd, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("genes detected") +
+ xlab("")
The exonic mapping rate is a measurement of how much RNA we actually sequenced; if the genomic mapping rate is super high and the exonic mapping rate is low, we have DNA contamination. There are a couple samples that look like they might have some DNA contamination, but nothing absolutely horrible.
> ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") +
+ xlab("")
These rates are low for all samples, which is good.
> eval_rRNA = "rRNA_rate" %in% colnames(summarydata) & !sum(is.na(summarydata$rRNA_rate)) ==
+ nrow(summarydata)
> ggplot(summarydata, aes(x = Name, y = rRNA_rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") +
+ xlab("")
This is round 1, which is good– if it deviates much from 1 then it is indicative of degradation of the RNA.
> ggplot(summarydata, aes(x = Name, y = X5.3.bias)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("5'->3' bias") +
+ xlab("")
The distributions of counts looks pretty similar across all of the samples.
> melted = melt(counts)
> colnames(melted) = c("sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
Normalizing makes them much more similar, which is good. Trimmed mean of M-values (TMM) normalization is described here
Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25
> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
> melted = melt(normalized_counts)
> colnames(melted) = c("gene", "sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
Samples of the same genotype and treatment cluster together by Spearman correlation which is a good sign.
> heatmap_fn(cor(normalized_counts, method = "pearson"))
> heatmap_fn(cor(normalized_counts, method = "spearman"))
Samples separate very cleanly along the 1st and 2nd principal components by genotype and treatment. This is good news for differential expression analyses.
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> vst = varianceStabilizingTransformation(dds)
> pca_loadings = function(object, ntop = 500) {
+ rv <- matrixStats::rowVars(assay(object))
+ select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
+ pca <- prcomp(t(assay(object)[select, ]))
+ percentVar <- pca$sdev^2/sum(pca$sdev^2)
+ names(percentVar) = colnames(pca$x)
+ pca$percentVar = percentVar
+ return(pca)
+ }
> pc = pca_loadings(vst)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "Name"
> pca_plot = function(comps, nc1, nc2, colorby) {
+ c1str = paste0("PC", nc1)
+ c2str = paste0("PC", nc2)
+ if (!(c1str %in% colnames(comps) && c2str %in% colnames(comps))) {
+ warning("Higher order components not found, skipping plotting.")
+ return(NA)
+ }
+ ggplot(comps, aes_string(c1str, c2str, color = colorby)) + geom_point() +
+ theme_bw() + xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100),
+ "% variance")) + ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] *
+ 100), "% variance"))
+ }
> pca_plot(comps, 1, 2, "gt")
> pca_plot(comps, 3, 4, "gt")
> pca_plot(comps, 5, 6, "gt")
> ggplot(data.frame(component = reorder(names(pc$percentVar), -pc$percentVar),
+ percent_var = pc$percentVar), aes(component, percent_var)) + geom_bar(stat = "identity") +
+ ylab("percent of total variation") + xlab("") + theme_bw()
> # snagged from development version of DESeq
> DESeqDataSetFromTximport <- function(txi, colData, design, ...) {
+ counts <- round(txi$counts)
+ mode(counts) <- "integer"
+ dds <- DESeqDataSetFromMatrix(countData = counts, colData = colData, design = design,
+ ...)
+ stopifnot(txi$countsFromAbundance %in% c("no", "scaledTPM", "lengthScaledTPM"))
+ if (txi$countsFromAbundance %in% c("scaledTPM", "lengthScaledTPM")) {
+ message("using length scaled TPM counts from tximport")
+ } else {
+ message("using counts and average transcript lengths from tximport")
+ lengths <- txi$length
+ dimnames(lengths) <- dimnames(dds)
+ assays(dds)[["avgTxLength"]] <- lengths
+ }
+ return(dds)
+ }
>
> subset_tximport = function(txi, rows, columns) {
+ txi$counts = txi$counts[rows, columns]
+ txi$abundance = txi$abundance[rows, columns]
+ txi$length = txi$length[rows, columns]
+ return(txi)
+ }
> deseq2resids = function(dds) {
+ # calculate residuals for a deseq2 fit
+ fitted = t(t(assays(dds)[["mu"]])/sizeFactors(dds))
+ return(counts(dds, normalized = TRUE) - fitted)
+ }
> library(DEGreport)
> library(vsn)
> design = ~genotype + treatment + genotype:treatment
> summarydata$treatment = relevel(summarydata$treatment, "untreated")
> summarydata$genotype = relevel(summarydata$genotype, "wt")
Here we fit a model that looks like ~genotype + treatment + genotype:treatment. The coefficient of genotype will be the effect due to it being a knockout, when there has been no treatment. The coefficient of treatment will be the effect of effect of treatment on the control cells. The coefficient of genotype:treatment will be the effect of treatment on the knockout cells.
> counts <- counts[rowSums(counts > 0) > 1, ]
> if (exists("txi.salmon")) {
+ txi.salmon = subset_tximport(txi.salmon, rownames(counts), colnames(counts))
+ dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
+ } else {
+ dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata,
+ design = design)
+ }
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row !=
+ 0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)
These dispersion estimates look normal– the red line is the fitted value for the mean-variance relationship, the black dots are the gene-wise dispersion values and the blue dots are the gene-wise dispersions shrunk back to the fitted line.
> plotDispEsts(dds)
> library(biomaRt)
> biomart_dataset = "mmusculus_gene_ensembl"
> mart = biomaRt::useMart(biomart = "ensembl", dataset = biomart_dataset)
> symbols = biomaRt::getBM(attributes = c("ensembl_gene_id", "mgi_symbol"), mart = mart)
> entrez = biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene"), mart = mart)
> entrez$entrezgene = as.character(entrez$entrezgene)
>
> plotMA_full = function(res) {
+ ymax = max(res$log2FoldChange, na.rm = TRUE)
+ ymin = min(res$log2FoldChange, na.rm = TRUE)
+ plotMA(res, ylim = c(ymin, ymax))
+ }
> volcano = function(res) {
+ stats = as.data.frame(res[, c(2, 6)])
+ p = volcano_density_plot(stats, lfc.cutoff = 1.5)
+ print(p)
+ }
> markup_deseq = function(res, fname) {
+ out_df = as.data.frame(res)
+ out_df$id = rownames(out_df)
+ out_df = out_df[, c("id", colnames(out_df)[colnames(out_df) != "id"])]
+ out_df = out_df %>% left_join(symbols, by = c(id = "ensembl_gene_id")) %>%
+ arrange(padj)
+ return(out_df)
+ }
> write_deseq = function(res, fname) {
+ out_df = as.data.frame(res)
+ out_df$id = rownames(out_df)
+ out_df = out_df[, c("id", colnames(out_df)[colnames(out_df) != "id"])]
+ out_df = out_df %>% left_join(symbols, by = c(id = "ensembl_gene_id")) %>%
+ arrange(padj)
+ write.table(out_df, file = fname, quote = FALSE, sep = ",", row.names = FALSE,
+ col.names = TRUE)
+ }
> of_interest = c("Tgfb1", "Tgfb2", "Pdgfc", "Il6")
> ko = results(dds, name = "genotype_yaptaz_vs_wt")
> komarked = markup_deseq(ko)
> plotMA_full(ko)
> volcano(ko)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1119]
> write_deseq(ko, "knockout-effect.csv")
There are 308 genes affected by the knockout.
> treatwt = results(dds, name = "treatment_ccl4_vs_untreated")
> treatwtmarked = markup_deseq(treatwt)
> plotMA_full(treatwt)
> volcano(treatwt)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1286]
> write_deseq(treatwt, "treatment-effect-on-wildtype.csv")
There are 513 genes affected by the CCL4 treatment in the wildtype.
> treatko = results(dds, list(c("treatment_ccl4_vs_untreated", "genotypeyaptaz.treatmentccl4")))
> treatkomarked = markup_deseq(treatko)
> plotMA_full(treatko)
> volcano(treatko)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1453]
> write_deseq(treatko, "treatment-effect-on-knockout.csv")
There are 3176 genes affected by the CCL4 treatment in the knockout.
> treatkospecific = results(dds, name = "genotypeyaptaz.treatmentccl4")
> plotMA_full(treatkospecific)
> volcano(treatkospecific)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1620]
> treatkospecificmarked = markup_deseq(treatkospecific)
> write_deseq(treatkospecific, "specific-effect-on-knockout.csv")
There are 796 genes specifically affected differently by the CCL4 treatment in the knockout compared to the wildtype.
These tables have the format:
id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,mgi_symbol
> orgdb = "org.Mm.eg.db"
> biomart_dataset = "mmusculus_gene_ensembl"
> keggname = "mmu"
> library(dplyr)
> library(clusterProfiler)
> library(orgdb, character.only = TRUE)
> library(biomaRt)
> mart = biomaRt::useMart(biomart = "ensembl", dataset = biomart_dataset)
> entrez = biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene"), mart = mart)
> entrez$entrezgene = as.character(entrez$entrezgene)
> entrezsymbol = biomaRt::getBM(attributes = c("mgi_symbol", "entrezgene"), mart = mart)
> entrezsymbol$entrezgene = as.character(entrezsymbol$entrezgene)
> summarize_cp = function(res, comparison) {
+ summaries = data.frame()
+ for (ont in names(res)) {
+ ontsum = summary(res[[ont]])
+ ontsum$ont = ont
+ summaries = rbind(summaries, ontsum)
+ }
+ summaries$comparison = comparison
+ return(summaries)
+ }
>
> enrich_cp = function(res, comparison) {
+ res = res %>% data.frame() %>% tibble::rownames_to_column() %>% left_join(entrez,
+ by = c(rowname = "ensembl_gene_id")) %>% filter(!is.na(entrezgene))
+ universe = res$entrezgene
+ genes = subset(res, padj < 0.05)$entrezgene
+ mf = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "MF", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ cc = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "CC", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ bp = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "BP", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ kg = enrichKEGG(gene = genes, universe = universe, organism = "mmu", pvalueCutoff = 1,
+ qvalueCutoff = 1, pAdjustMethod = "BH")
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+ all[["summary"]] = summarize_cp(all, comparison)
+ return(all)
+ }
> gsea_cp = function(res, comparison) {
+ res = res %>% left_join(entrez, by = c(rowname = "ensembl_gene_id")) %>%
+ filter(!is.na(entrezgene)) %>% filter(!is.na(log2FoldChange)) %>% filter(!is.na(lfcSE))
+ lfc = data.frame(res)[, "log2FoldChange"]
+ lfcse = data.frame(res)[, "lfcSE"]
+ genes = lfc/lfcse
+ names(genes) = res$entrezgene
+ genes = genes[order(genes, decreasing = TRUE)]
+ cc = gseGO(genes, ont = "CC", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ mf = gseGO(genes, ont = "MF", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ bp = gseGO(genes, ont = "bp", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ genes = data.frame(res)[, "log2FoldChange"]
+ names(genes) = res$entrezgene
+ genes = genes[order(genes, decreasing = TRUE)]
+ genes = genes[!is.na(genes)]
+ kg = gseKEGG(geneList = genes, organism = keggname, nPerm = 500, pvalueCutoff = 1,
+ verbose = TRUE)
+ if (orgdb == "org.Hs.eg.db") {
+ do = summary(gseDO(geneList = genes, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE))
+ do$ont = "DO"
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg, do = do)
+ } else {
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+ }
+ all[["summary"]] = summarize_cp(all, comparison)
+ return(all)
+ }
>
> convert_core_ids = function(res) {
+ res = res %>% mutate(geneID = strsplit(as.character(geneID), "/")) %>% tidyr::unnest(geneID) %>%
+ left_join(entrezsymbol, by = c(geneID = "entrezgene")) %>% group_by(ID,
+ Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue, Count, ont,
+ comparison) %>% summarise(geneID = paste(geneID, collapse = "/"), symbol = paste(mgi_symbol,
+ collapse = "/"))
+ return(res)
+ }
> direction_breakdown = function(res, deseq2res) {
+ ocols = colnames(res)
+ deseq2res = deseq2res %>% data.frame() %>% tibble::rownames_to_column() %>%
+ dplyr::select(log2FoldChange, rowname)
+ ids = res %>% mutate(symbol = strsplit(as.character(symbol), "/")) %>% tidyr::unnest(symbol) %>%
+ left_join(symbols, by = c(symbol = "mgi_symbol")) %>% left_join(deseq2res,
+ by = c(ensembl_gene_id = "rowname")) %>% mutate(direction = ifelse(log2FoldChange >
+ 0, "up", "down"))
+ ids = ids[, c(ocols, "direction")]
+ ids = ids %>% group_by(ID, Description, GeneRatio, BgRatio, pvalue, p.adjust,
+ qvalue, Count, ont, pvalue, p.adjust, qvalue, comparison, direction) %>%
+ summarise(symbols = paste(symbol, collapse = "/"))
+ return(ids)
+ }
> library(readr)
> ko_enrich = enrich_cp(ko, "knockout-effect")
> ko_out = convert_core_ids(subset(ko_enrich$summary, qvalue < 0.05))
> ko_out = direction_breakdown(ko_out, ko)
> write_csv(ko_out, "knockout-effect-pathways.csv")
> treatwt_enrich = enrich_cp(treatwt, "treatment-effect-on-wt")
> treatwt_out = convert_core_ids(subset(treatwt_enrich$summary, qvalue < 0.05))
> treatwt_out = direction_breakdown(treatwt_out, treatwt)
> treatwt_int = read_csv("followup/treatment-wt.txt", col_names = "ID")
> treatwt_out$interesting = treatwt_out$ID %in% treatwt_int$ID
> write_csv(treatwt_out, "treatment-effect-on-wt-pathways.csv")
> treatko_enrich = enrich_cp(treatko, "treatment-effect-on-ko")
> treatko_out = convert_core_ids(subset(treatko_enrich$summary, qvalue < 0.05))
> treatko_out = direction_breakdown(treatko_out, treatko)
> treatko_int = read_csv("followup/treatment-ko.txt", col_names = "ID")
> treatko_out$interesting = treatko_out$ID %in% treatko_int$ID
> write_csv(treatko_out, "treatment-effect-on-ko-pathways.csv")
> treatkospecific_enrich = enrich_cp(treatkospecific, "specific-treatment-effect-on-ko")
> treatkospecific_out = convert_core_ids(subset(treatkospecific_enrich$summary,
+ qvalue < 0.05))
> treatkospecific_out = direction_breakdown(treatkospecific_out, treatkospecific)
> treatkospecific_int = read_csv("followup/treatment-specific-ko.txt", col_names = "ID")
> treatkospecific_out$interesting = treatkospecific_out$ID %in% treatkospecific_int$ID
> write_csv(treatkospecific_out, "specific-effect-on-ko-pathways.csv")
Treatment effect on wildtype, pathways
> tpm = txi.salmon$abundance %>% as.data.frame() %>% tibble::rownames_to_column() %>%
+ left_join(symbols, by = c(rowname = "ensembl_gene_id"))
> write_csv(tpm, "tpm.csv")
> summarydata$sample = rownames(summarydata)
> il1bvals = tpm %>% dplyr::filter(mgi_symbol == "Il1b") %>% tidyr::gather(sample,
+ tpm, -mgi_symbol, -rowname) %>% left_join(summarydata, by = "sample")
> ggplot(il1bvals, aes(sample, tpm)) + geom_point() + theme(axis.text.x = element_text(angle = 90,
+ hjust = 1))
> ggplot(subset(il1bvals, tpm < 50), aes(sample, tpm)) + geom_point() + theme(axis.text.x = element_text(angle = 90,
+ hjust = 1))
Dean posed this question:
Could you check if our Hippo gene signature is significantly changed in the
context of injury. Is it normalized after injury when Yap/Taz are knocked out? I
have included the a file called "Yap Targets.grp" which documents genes
typically changed in cell lines when Hippo Signaling is inactivated. Usually we
use the GSEA suite to examine the significance.
We could do GSEA, but it will make it tough to answer the question ‘is it normalized’. Here we will try to answer that question by measuring the distance between all samples via PCA, using those genes to query. The file we were sent is in human symbol form, though. So we have to convert that first:
> library(biomaRt)
> human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
> mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
> symbolconv = getLDS(attributes = c("mgi_symbol", "ensembl_gene_id"), mart = mouse,
+ attributesL = c("hgnc_symbol"), martL = human)
> colnames(symbolconv) = c("mouse", "id", "human")
> target_fn = "/Volumes/Clotho/Users/rory/cache/yimlamai-hepatocytes/metadata/yap-targets.grp"
> extras = data.frame(symbol = c("NOCT", "ASAP1", "COLGAT1", "IQCJ-SCHIP1", "AGFG2"))
> targets = read_csv(target_fn, skip = 1, col_names = c("symbol"))
> targets = rbind(targets, extras)
> targets = targets %>% left_join(symbolconv, by = c(symbol = "human"))
> genes = unique(targets$id)[!is.na(unique(targets$id))]
> genes = genes[genes %in% rownames(counts)]
Below at the PCA plots using those genes. This doesn’t look too good for the hypothesis that treating the Yap/Taz knockout samples normalizes these genes after injury. If this made the Yap/Taz cells more like the WT cells, we’d expect them to overlap. Instead we see the CCL4 treated YapTaz knockout cells are further away.
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> dds = estimateSizeFactors(dds)
> vst = varianceStabilizingTransformation(dds)
> pca_yap = function(object, genes) {
+ pca <- prcomp(t(assay(object)[genes, ]))
+ percentVar <- pca$sdev^2/sum(pca$sdev^2)
+ names(percentVar) = colnames(pca$x)
+ pca$percentVar = percentVar
+ return(pca)
+ }
> pc = pca_yap(vst, genes)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "Name"
> pca_plot(comps, 1, 2, "gt")
Heirarchical clustering also doesn’t bear out this hypothesis, using the normalized counts:
> ncounts = log(counts(dds, normalized = TRUE) + 1)
> toheatmap = ncounts[genes, ] %>% as.data.frame() %>% tibble::rownames_to_column() %>%
+ left_join(targets, by = c(rowname = "id")) %>% na.omit()
> toplot = data.frame(toheatmap, check.names = FALSE)
> rownames(toplot) = toplot$mouse
> toplot = toplot[, !colnames(toplot) %in% c("symbol", "mouse", "id", "rowname")]
> heatmap_fn(toplot, fontsize = 7)
This is the same as above, but using the Piccolo lab genes:
> library(biomaRt)
> human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
> mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
> symbolconv = getLDS(attributes = c("mgi_symbol", "ensembl_gene_id"), mart = mouse,
+ attributesL = c("hgnc_symbol"), martL = human)
> colnames(symbolconv) = c("mouse", "id", "human")
> target_fn = "/Volumes/Clotho/Users/rory/cache/yimlamai-hepatocytes/metadata/piccolo.txt"
> targets = read_csv(target_fn, col_names = c("symbol"))
> targets = targets %>% left_join(symbolconv, by = c(symbol = "human"))
> genes = unique(targets$id)[!is.na(unique(targets$id))]
> genes = genes[genes %in% rownames(counts)]
We sees a similar pattern as the above, CCL4 treatment make the YapTaz cells more different than the untreated cells, not less.
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> dds = estimateSizeFactors(dds)
> vst = varianceStabilizingTransformation(dds)
> pca_yap = function(object, genes) {
+ pca <- prcomp(t(assay(object)[genes, ]))
+ percentVar <- pca$sdev^2/sum(pca$sdev^2)
+ names(percentVar) = colnames(pca$x)
+ pca$percentVar = percentVar
+ return(pca)
+ }
> pc = pca_yap(vst, genes)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "Name"
> pca_plot(comps, 1, 2, "gt")
Heirarchical clustering also doesn’t bear out this hypothesis.
> ncounts = log(counts(dds, normalized = TRUE) + 1)
> toheatmap = ncounts[genes, ] %>% as.data.frame() %>% tibble::rownames_to_column() %>%
+ left_join(targets, by = c(rowname = "id")) %>% na.omit()
> toplot = data.frame(toheatmap, check.names = FALSE)
> rownames(toplot) = toplot$mouse
> toplot = toplot[, !colnames(toplot) %in% c("symbol", "mouse", "id", "rowname")]
> heatmap_fn(toplot, fontsize = 7)
Here we try to find genes that are markers for each of the four groups. By markers we mean genes that are up or down only in that specific group compared to all of the other groups. Since we only have four groups, we can do this by looking, for each group, at all pairwise comparisons with the other groups, and then picking out the genes which are significantly up or down in a given group compared to each of the other groups. The heatmaps are of the logged, normalized counts for these genes in each sample.
To simplify the output, we took those genes and did a differential expression test of the gene in the given sample group against the average of all the other samples. We didn’t use this to select the genes, it is just to have an idea of which direction and an overall estimate of the magnitude of the change in gene expression in the particular group. These are available at the bottom in the section Marker gene tables
> design = ~gt
> counts <- counts[rowSums(counts > 0) > 1, ]
> if (exists("txi.salmon")) {
+ txi.salmon = subset_tximport(txi.salmon, rownames(counts), colnames(counts))
+ dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
+ } else {
+ dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata,
+ design = design)
+ }
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row !=
+ 0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)
> comp1 = results(dds, contrast = c("gt", "wt_untreated", "wt_ccl4"))
> comp2 = results(dds, contrast = c("gt", "wt_untreated", "yaptaz_ccl4"))
> comp3 = results(dds, contrast = c("gt", "wt_untreated", "yaptaz_untreated"))
> genes = rownames(subset(comp1, padj < 0.05))
> genes = intersect(genes, rownames(subset(comp2, padj < 0.05)))
> genes = intersect(genes, rownames(subset(comp3, padj < 0.05)))
> allup = (comp1[genes, "log2FoldChange"] > 0) & (comp2[genes, "log2FoldChange"] >
+ 0) & (comp3[genes, "log2FoldChange"] > 0)
> alldown = (comp1[genes, "log2FoldChange"] < 0) & (comp2[genes, "log2FoldChange"] <
+ 0) & (comp3[genes, "log2FoldChange"] < 0)
> samedir = allup | alldown
> wtmarkers = genes[samedir]
> heatmap_fn(log(ncounts[wtmarkers, ] + 1))
> combined = results(dds, contrast = list(c("gtwt_untreated"), c("gtwt_ccl4",
+ "gtyaptaz_ccl4", "gtyaptaz_untreated")), listValues = c(1, -1/3))
> combined = combined[wtmarkers, ]
> write_deseq(combined, "wt-untreated-markers.csv")
> comp1 = results(dds, contrast = c("gt", "wt_ccl4", "wt_untreated"))
> comp2 = results(dds, contrast = c("gt", "wt_ccl4", "yaptaz_ccl4"))
> comp3 = results(dds, contrast = c("gt", "wt_ccl4", "yaptaz_untreated"))
> genes = rownames(subset(comp1, padj < 0.05))
> genes = intersect(genes, rownames(subset(comp2, padj < 0.05)))
> genes = intersect(genes, rownames(subset(comp3, padj < 0.05)))
> allup = (comp1[genes, "log2FoldChange"] > 0) & (comp2[genes, "log2FoldChange"] >
+ 0) & (comp3[genes, "log2FoldChange"] > 0)
> alldown = (comp1[genes, "log2FoldChange"] < 0) & (comp2[genes, "log2FoldChange"] <
+ 0) & (comp3[genes, "log2FoldChange"] < 0)
> samedir = allup | alldown
> wtccl4markers = genes[samedir]
> heatmap_fn(log(ncounts[wtccl4markers, ] + 1))
> combined = results(dds, contrast = list(c("gtwt_ccl4"), c("gtwt_untreated",
+ "gtyaptaz_ccl4", "gtyaptaz_untreated")), listValues = c(1, -1/3))
> combined = combined[wtccl4markers, ]
> write_deseq(combined, "wt-ccl4-markers.csv")
> comp1 = results(dds, contrast = c("gt", "yaptaz_untreated", "wt_ccl4"))
> comp2 = results(dds, contrast = c("gt", "yaptaz_untreated", "yaptaz_ccl4"))
> comp3 = results(dds, contrast = c("gt", "yaptaz_untreated", "wt_untreated"))
> genes = rownames(subset(comp1, padj < 0.05))
> genes = intersect(genes, rownames(subset(comp2, padj < 0.05)))
> genes = intersect(genes, rownames(subset(comp3, padj < 0.05)))
> allup = (comp1[genes, "log2FoldChange"] > 0) & (comp2[genes, "log2FoldChange"] >
+ 0) & (comp3[genes, "log2FoldChange"] > 0)
> alldown = (comp1[genes, "log2FoldChange"] < 0) & (comp2[genes, "log2FoldChange"] <
+ 0) & (comp3[genes, "log2FoldChange"] < 0)
> samedir = allup | alldown
> yaptazmarkers = genes[samedir]
> heatmap_fn(log(ncounts[yaptazmarkers, ] + 1))
> combined = results(dds, contrast = list(c("gtyaptaz_untreated"), c("gtwt_untreated",
+ "gtyaptaz_ccl4", "gtwt_ccl4")), listValues = c(1, -1/3))
> combined = combined[yaptazmarkers, ]
> write_deseq(combined, "yaptaz-untreated-markers.csv")
> comp1 = results(dds, contrast = c("gt", "yaptaz_ccl4", "wt_ccl4"))
> comp2 = results(dds, contrast = c("gt", "yaptaz_ccl4", "yaptaz_untreated"))
> comp3 = results(dds, contrast = c("gt", "yaptaz_ccl4", "wt_untreated"))
> genes = rownames(subset(comp1, padj < 0.05 & abs(log2FoldChange) > 3))
> genes = intersect(genes, rownames(subset(comp2, padj < 0.05 & abs(log2FoldChange) >
+ 3)))
> genes = intersect(genes, rownames(subset(comp3, padj < 0.05 & abs(log2FoldChange) >
+ 3)))
> allup = (comp1[genes, "log2FoldChange"] > 0) & (comp2[genes, "log2FoldChange"] >
+ 0) & (comp3[genes, "log2FoldChange"] > 0)
> alldown = (comp1[genes, "log2FoldChange"] < 0) & (comp2[genes, "log2FoldChange"] <
+ 0) & (comp3[genes, "log2FoldChange"] < 0)
> samedir = allup | alldown
> yaptaztreatedmarkers = genes[samedir]
> heatmap_fn(log(ncounts[yaptaztreatedmarkers, ] + 1), fontsize = 8)
> combined = results(dds, contrast = list(c("gtyaptaz_ccl4"), c("gtwt_untreated",
+ "gtyaptaz_untreated", "gtwt_ccl4")), listValues = c(1, -1/3))
> combined = combined[yaptaztreatedmarkers, ]
> write_deseq(combined, "yaptaz-ccl4-markers.csv")
Here we look at the data in a slightly different way. Instead of looking at interactions of genotype and treatment, we do pairwise comparisons between the groups. This is a lot easier to think about, but makes the question question “which genes are specifically affected by treatment in the KO” a lot harder to answer.
This compares the KO untreated samples to the WT untreated samples. Here, positive log2 fold changes
> ko = results(dds, contrast = c("gt", "yaptaz_untreated", "wt_untreated"), addMLE = TRUE)
> komarked = markup_deseq(ko)
> plotMA_full(ko)
> volcano(ko)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.2193]
> write_deseq(ko, "pairwise-ko-untreated-vs-wt-untreated.csv")
There are 266 genes different in the untreated KO samples compared to the untreated WT samples.
> treatwt = results(dds, contrast = c("gt", "wt_ccl4", "wt_untreated"), addMLE = TRUE)
> treatwtmarked = markup_deseq(treatwt)
> plotMA_full(treatwt)
> write_deseq(treatwt, "pairwise-wt-treated-vs-wt-untreated.csv")
There are 441 genes different in the WT treated samples compared to the WT untreated samples.
> treatko = results(dds, contrast = c("gt", "yaptaz_ccl4", "yaptaz_untreated"),
+ addMLE = TRUE)
> treatkomarked = markup_deseq(treatko)
> plotMA_full(treatko)
> volcano(treatko)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.2527]
> write_deseq(treatko, "pairwise-ko-treated-vs-ko-untreated.csv")
There are 2868 genes different in the KO treated samples compared to the KO untreated samples.
Here is a Venn diagram of the overlaps of genes called significant in each of these comparisons.
> kosig = subset(ko, padj < 0.05)
> treatwtsig = subset(treatwt, padj < 0.05)
> treatkosig = subset(treatko, padj < 0.05)
> library(gplots)
> venn(list(`WT treated vs WT untreated` = rownames(treatwtsig), `KO untreated vs WT untreated` = rownames(kosig),
+ `KO treated vs KO untreated` = rownames(treatkosig)))
Here we repeat the pathway analysis, but using the three sets of genes identified by the pairwise interactions as input.
> library(readr)
> ko_enrich = enrich_cp(ko, "knockout-effect")
> ko_out = convert_core_ids(subset(ko_enrich$summary, qvalue < 0.05))
> ko_out = direction_breakdown(ko_out, ko)
> write_csv(ko_out, "pairwise-ko-untreated-vs-wt-untreated-pathways.csv")
> treatwt_enrich = enrich_cp(treatwt, "treatment-effect-on-wt")
> treatwt_out = convert_core_ids(subset(treatwt_enrich$summary, qvalue < 0.05))
> treatwt_out = direction_breakdown(treatwt_out, treatwt)
> treatwt_int = read_csv("followup/treatment-wt.txt", col_names = "ID")
> treatwt_out$interesting = treatwt_out$ID %in% treatwt_int$ID
> write_csv(treatwt_out, "pairwise-wt-treated-vs-wt-untreated-pathways.csv")
> treatko_enrich = enrich_cp(treatko, "treatment-effect-on-ko")
> treatko_out = convert_core_ids(subset(treatko_enrich$summary, qvalue < 0.05))
> treatko_out = direction_breakdown(treatko_out, treatko)
> treatko_int = read_csv("followup/treatment-ko.txt", col_names = "ID")
> treatko_out$interesting = treatko_out$ID %in% treatko_int$ID
> write_csv(treatko_out, "pairwise-ko-treated-vs-ko-untreated-pathways.csv")
KO untreated vs WT untreated, pathways
Here we compare the effect of the knockout identified by doing the pairwise comparisons and the effect identified by looking at the “ko” term in the interaction model:
> pairwise = read_csv("pairwise-ko-untreated-vs-wt-untreated.csv")
> interaction = read_csv("knockout-effect.csv")
> combined = pairwise %>% left_join(interaction, by = "id") %>% dplyr::select(id,
+ log2FoldChange.y, lfcMLE)
> colnames(combined) = c("id", "interaction", "paired")
> ggplot(combined, aes(interaction, paired)) + geom_point()
Here we compare the effect of the treatment on the wildtype that we identified by doing the pairwise comparisons and the effect identified by looking at the “treatment” term in the interaction model:
> pairwise = read_csv("pairwise-wt-treated-vs-wt-untreated.csv")
> interaction = read_csv("treatment-effect-on-wildtype.csv")
> combined = pairwise %>% left_join(interaction, by = "id") %>% dplyr::select(id,
+ log2FoldChange.y, lfcMLE)
> colnames(combined) = c("id", "interaction", "paired")
> ggplot(combined, aes(interaction, paired)) + geom_point()
Here we compare the effect of the treatment on the knockout that we identified by doing the pairwise comparisons of the KO treated to the KO untreated and the effect identified by looking at the combination of the main treatment effect and the treatment effect specific to the KO.
> pairwise = read_csv("pairwise-ko-treated-vs-ko-untreated.csv")
> interaction = read_csv("treatment-effect-on-knockout.csv")
> combined = pairwise %>% left_join(interaction, by = "id") %>% dplyr::select(id,
+ log2FoldChange.y, lfcMLE)
> colnames(combined) = c("id", "interaction", "paired")
> ggplot(combined, aes(interaction, paired)) + geom_point()